# Loading both training and testing data
# Read both CSVs, treating blanks, "NA" and the Excel division-error token
# as missing values. Use FALSE rather than the reassignable shorthand F.
Teach <- read.csv("pml-training.csv", stringsAsFactors = FALSE,
                  na.strings = c("", "NA", "#DIV/0!"))
Ythu <- read.csv("pml-testing.csv", stringsAsFactors = FALSE,
                 na.strings = c("", "NA", "#DIV/0!"))
dim(Teach)
## [1] 19622 160
dim(Ythu)
## [1] 20 160
# Hold out 20% of the labelled data for validation, stratified on classe.
# set.seed() makes the random split reproducible (the original run was
# unseeded, so the recorded dims below may shift by a row or two on rerun).
set.seed(1234)
teach_devide <- createDataPartition(Teach$classe, p = 0.8, list = FALSE)
Val_data <- Teach[-teach_devide, ]
Teach <- Teach[teach_devide, ]
dim(Teach)
## [1] 15699 160
dim(Val_data)
## [1] 3923 160
# Class proportions in the training split — a quick imbalance check.
table(Teach$classe) / nrow(Teach)
##
## A B C D E
## 0.2843493 0.1935155 0.1744060 0.1638958 0.1838334
# Missing-value count per belt-sensor column. contains("_belt") replaces
# the names()[grepl()] detour, and vapply() (unlike sapply()) guarantees a
# named integer vector regardless of how many columns match.
kl <- vapply(select(Teach, contains("_belt")),
             function(x) sum(is.na(x)), integer(1L))
kl
## roll_belt pitch_belt yaw_belt
## 0 0 0
## total_accel_belt kurtosis_roll_belt kurtosis_picth_belt
## 0 15391 15415
## kurtosis_yaw_belt skewness_roll_belt skewness_roll_belt.1
## 15699 15390 15415
## skewness_yaw_belt max_roll_belt max_picth_belt
## 15699 15385 15385
## max_yaw_belt min_roll_belt min_pitch_belt
## 15391 15385 15385
## min_yaw_belt amplitude_roll_belt amplitude_pitch_belt
## 15391 15385 15385
## amplitude_yaw_belt var_total_accel_belt avg_roll_belt
## 15391 15385 15385
## stddev_roll_belt var_roll_belt avg_pitch_belt
## 15385 15385 15385
## stddev_pitch_belt var_pitch_belt avg_yaw_belt
## 15385 15385 15385
## stddev_yaw_belt var_yaw_belt gyros_belt_x
## 15385 15385 0
## gyros_belt_y gyros_belt_z accel_belt_x
## 0 0 0
## accel_belt_y accel_belt_z magnet_belt_x
## 0 0 0
## magnet_belt_y magnet_belt_z
## 0 0
# Missing-value count per arm-sensor column (note "_arm" does not match the
# "_forearm" columns, which are counted separately below). vapply() keeps
# the result a typed integer vector.
vk <- vapply(select(Teach, contains("_arm")),
             function(x) sum(is.na(x)), integer(1L))
vk
## roll_arm pitch_arm yaw_arm total_accel_arm
## 0 0 0 0
## var_accel_arm avg_roll_arm stddev_roll_arm var_roll_arm
## 15385 15385 15385 15385
## avg_pitch_arm stddev_pitch_arm var_pitch_arm avg_yaw_arm
## 15385 15385 15385 15385
## stddev_yaw_arm var_yaw_arm gyros_arm_x gyros_arm_y
## 15385 15385 0 0
## gyros_arm_z accel_arm_x accel_arm_y accel_arm_z
## 0 0 0 0
## magnet_arm_x magnet_arm_y magnet_arm_z kurtosis_roll_arm
## 0 0 0 15449
## kurtosis_picth_arm kurtosis_yaw_arm skewness_roll_arm skewness_pitch_arm
## 15451 15395 15448 15451
## skewness_yaw_arm max_roll_arm max_picth_arm max_yaw_arm
## 15395 15385 15385 15385
## min_roll_arm min_pitch_arm min_yaw_arm amplitude_roll_arm
## 15385 15385 15385 15385
## amplitude_pitch_arm amplitude_yaw_arm
## 15385 15385
# Missing-value count per forearm-sensor column, same pattern as above.
cg <- vapply(select(Teach, contains("_forearm")),
             function(x) sum(is.na(x)), integer(1L))
cg
## roll_forearm pitch_forearm yaw_forearm
## 0 0 0
## kurtosis_roll_forearm kurtosis_picth_forearm kurtosis_yaw_forearm
## 15448 15449 15699
## skewness_roll_forearm skewness_pitch_forearm skewness_yaw_forearm
## 15447 15449 15699
## max_roll_forearm max_picth_forearm max_yaw_forearm
## 15385 15385 15448
## min_roll_forearm min_pitch_forearm min_yaw_forearm
## 15385 15385 15448
## amplitude_roll_forearm amplitude_pitch_forearm amplitude_yaw_forearm
## 15385 15385 15448
## total_accel_forearm var_accel_forearm avg_roll_forearm
## 0 15385 15385
## stddev_roll_forearm var_roll_forearm avg_pitch_forearm
## 15385 15385 15385
## stddev_pitch_forearm var_pitch_forearm avg_yaw_forearm
## 15385 15385 15385
## stddev_yaw_forearm var_yaw_forearm gyros_forearm_x
## 15385 15385 0
## gyros_forearm_y gyros_forearm_z accel_forearm_x
## 0 0 0
## accel_forearm_y accel_forearm_z magnet_forearm_x
## 0 0 0
## magnet_forearm_y magnet_forearm_z
## 0 0
# Missing-value count per dumbbell-sensor column, same pattern as above.
rj <- vapply(select(Teach, contains("_dumbbell")),
             function(x) sum(is.na(x)), integer(1L))
rj
## roll_dumbbell pitch_dumbbell yaw_dumbbell
## 0 0 0
## kurtosis_roll_dumbbell kurtosis_picth_dumbbell kurtosis_yaw_dumbbell
## 15389 15387 15699
## skewness_roll_dumbbell skewness_pitch_dumbbell skewness_yaw_dumbbell
## 15388 15386 15699
## max_roll_dumbbell max_picth_dumbbell max_yaw_dumbbell
## 15385 15385 15389
## min_roll_dumbbell min_pitch_dumbbell min_yaw_dumbbell
## 15385 15385 15389
## amplitude_roll_dumbbell amplitude_pitch_dumbbell amplitude_yaw_dumbbell
## 15385 15385 15389
## total_accel_dumbbell var_accel_dumbbell avg_roll_dumbbell
## 0 15385 15385
## stddev_roll_dumbbell var_roll_dumbbell avg_pitch_dumbbell
## 15385 15385 15385
## stddev_pitch_dumbbell var_pitch_dumbbell avg_yaw_dumbbell
## 15385 15385 15385
## stddev_yaw_dumbbell var_yaw_dumbbell gyros_dumbbell_x
## 15385 15385 0
## gyros_dumbbell_y gyros_dumbbell_z accel_dumbbell_x
## 0 0 0
## accel_dumbbell_y accel_dumbbell_z magnet_dumbbell_x
## 0 0 0
## magnet_dumbbell_y magnet_dumbbell_z
## 0 0
# Names of every column that has at least one NA in the training split —
# 100 of the 160 columns are dropped (they are >98% missing, per the
# counts above, so imputation is not worthwhile).
rcb <- c(names(kl[kl != 0]), names(vk[vk != 0]),
         names(cg[cg != 0]), names(rj[rj != 0]))
length(rcb)
## [1] 100
# Drop the NA-heavy columns plus the bookkeeping columns (row id, user,
# timestamps, window markers), leaving 52 predictors + classe.
# tbl_df() is deprecated since dplyr 1.0.0, so use tibble::as_tibble();
# all_of(rcb) makes the external-vector selection explicit and removes
# the tidyselect ambiguity note that the original emitted.
foni_fg <- as_tibble(Teach %>%
  select(-all_of(rcb),
         -c(X, user_name, raw_timestamp_part_1, raw_timestamp_part_2,
            cvtd_timestamp, new_window, num_window)))
foni_fg$classe <- as.factor(foni_fg$classe)
# Coerce the 52 predictor columns to numeric in place.
foni_fg[, 1:52] <- lapply(foni_fg[, 1:52], as.numeric)
dim(foni_fg)
## [1] 15699 53
# Find predictors that are highly correlated (|r| > 0.8) with at least one
# OTHER predictor, then visualise their pairwise correlations.
o_o <- cor(select(foni_fg, -classe))
diag(o_o) <- 0                               # ignore self-correlation
o_o <- which(abs(o_o) > 0.8, arr.ind = TRUE) # TRUE, not shorthand T
o_o <- unique(row.names(o_o))
# all_of(o_o) silences the tidyselect external-vector note the original emitted.
corrplot(cor(select(foni_fg, all_of(o_o))),
         type = "upper", order = "hclust", method = "number")
# Binarize every predictor (4 bins per numeric variable; levels rarer than
# 1% lumped together), producing one-hot target columns classe__A..classe__E.
# binarize()/correlate() come from the correlationfunnel package —
# presumably loaded earlier in the file; confirm.
zxn <-foni_fg%>%binarize(n_bins=4,thresh_infreq=0.01)
# Correlation of every binarized feature against each class indicator.
# These five tables are not used programmatically; they were inspected by
# hand to pick the per-class predictor short-lists (a_pol..e_pol) below.
ms <- zxn %>% correlate(target=classe__A)
ar<-zxn%>%correlate(target=classe__B)
ws <- zxn%>%correlate(target=classe__C)
iu<-zxn%>%correlate(target=classe__D)
hj<-zxn %>% correlate(target = classe__E)
# Top-5 predictors per class, read off the correlation funnels above.
a_pol <- c("magnet_arm_x", "pitch_forearm", "magnet_dumbbell_y",
           "roll_forearm", "gyros_dumbbell_y")
b_pol <- c("magnet_dumbbell_y", "magnet_dumbbell_x", "roll_dumbbell",
           "magnet_belt_y", "accel_dumbbell_x")
c_pol <- c("magnet_dumbbell_y", "roll_dumbbell", "accel_dumbbell_y",
           "magnet_dumbbell_x", "magnet_dumbbell_z")
d_pol <- c("pitch_forearm", "magnet_arm_y", "magnet_forearm_x",
           "accel_dumbbell_y", "accel_forearm_x")
e_pol <- c("magnet_belt_y", "magnet_belt_z", "roll_belt",
           "gyros_belt_z", "magnet_dumbbell_y")
# Union of all per-class picks, preserving first-seen order. unique(c(...))
# is equivalent to the original element-by-element union() loop (which also
# shadowed base::c with its loop variable) and avoids growing in a loop.
jp <- unique(c(a_pol, b_pol, c_pol, d_pol, e_pol))
# Training data restricted to the 17 selected predictors + the label.
foni_fg2 <- foni_fg %>% select(all_of(jp), classe)
# How many picks come from each sensor location.
data.frame("arm" = sum(grepl("_arm", jp)),
           "forearm" = sum(grepl("_forearm", jp)),
           "belt" = sum(grepl("_belt", jp)),
           "dumbbell" = sum(grepl("_dumbbell", jp)))
## arm forearm belt dumbbell
## 1 2 4 4 7
# ggpairs diagonal panel helper: semi-transparent per-class density curves
# using the Set2 brewer palette. Extra aesthetics pass through via `...`.
k_b <- function(data, mapping, ...) {
  panel <- ggplot(data = data, mapping = mapping)
  panel +
    geom_density(..., alpha = 0.3) +
    scale_fill_brewer(palette = "Set2")
}
# ggpairs lower-triangle panel helper: heavily alpha-blended scatter points
# so dense clusters stay readable; Set2 brewer palette for fills.
k_n <- function(data, mapping, ...) {
  panel <- ggplot(data = data, mapping = mapping)
  panel +
    geom_point(..., alpha = 0.1) +
    scale_fill_brewer(palette = "Set2")
}
# Pairs plots of the 17 selected predictors, coloured by class, in three
# panels of columns (1-5, 6-10, 11-17): density on the diagonal (k_b),
# alpha-blended scatter in the lower triangle (k_n).
ggpairs(foni_fg2,columns = 1:5,aes(color = classe),lower = list(continuous = k_n),diag = list(continuous = k_b))
ggpairs(foni_fg2,columns=6:10,aes(color=classe),lower=list(continuous=k_n),diag =list(continuous=k_b))
ggpairs(foni_fg2,columns = 11:17,aes(color = classe),lower=list(continuous=k_n),diag=list(continuous=k_b))
# Rebuild modelling frames from the raw splits using only the 17 selected
# predictors, then fit the preprocessing transform on the TRAINING split
# only and apply it to both splits (no leakage into validation).
TeachF <- Teach %>% select(all_of(jp), classe)
Thv_infoF <- Val_data %>% select(all_of(jp), classe)
# lapply keeps each column typed; sapply would build an intermediate matrix.
TeachF[, 1:17] <- lapply(TeachF[, 1:17], as.numeric)
Thv_infoF[, 1:17] <- lapply(Thv_infoF[, 1:17], as.numeric)
thm <- c("A", "B", "C", "D", "E")
# select(-classe) is robust to column order, unlike positional [, -18].
abb <- preProcess(select(TeachF, -classe),
                  method = c("center", "scale", "BoxCox"))
Se_x <- predict(abb, select(TeachF, -classe))
Se_y <- factor(TeachF$classe, levels = thm)
W_x <- predict(abb, select(Thv_infoF, -classe))
W_y <- factor(Thv_infoF$classe, levels = thm)
# Four model families compared under the same 5-fold CV scheme. The seed
# makes resampling reproducible across reruns.
K_tr <- trainControl(method = "cv", number = 5)
set.seed(1234)
KT_n <- train(x = Se_x, y = Se_y, method = "rpart", trControl = K_tr)
WF_n <- train(x = Se_x, y = Se_y, method = "rf", trControl = K_tr,
              verbose = FALSE, metric = "Accuracy")
ERF_n <- train(x = Se_x, y = Se_y, method = "gbm", trControl = K_tr,
               verbose = FALSE)
RGN_n <- svm(x = Se_x, y = Se_y, kernel = "polynomial", cost = 10)
# Validation-set performance of the decision tree (rpart): ~49.7% accuracy.
# Note class D is never predicted (row D is all zeros), so sensitivity for
# D is 0 and its PPV is NaN — the single tree is clearly underfitting.
confusionMatrix(predict(KT_n,W_x),W_y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1013 305 321 299 100
## B 17 260 18 123 79
## C 83 194 345 221 210
## D 0 0 0 0 0
## E 3 0 0 0 332
##
## Overall Statistics
##
## Accuracy : 0.4971
## 95% CI : (0.4813, 0.5128)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.3428
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9077 0.34256 0.50439 0.0000 0.46047
## Specificity 0.6348 0.92509 0.78141 1.0000 0.99906
## Pos Pred Value 0.4971 0.52314 0.32764 NaN 0.99104
## Neg Pred Value 0.9454 0.85435 0.88188 0.8361 0.89158
## Prevalence 0.2845 0.19347 0.17436 0.1639 0.18379
## Detection Rate 0.2582 0.06628 0.08794 0.0000 0.08463
## Detection Prevalence 0.5195 0.12669 0.26842 0.0000 0.08539
## Balanced Accuracy 0.7713 0.63383 0.64290 0.5000 0.72977
# Validation-set performance of the random forest: ~98.5% accuracy, the
# best of the four models — this is the model used for the final answers.
confusionMatrix(predict(WF_n,W_x),W_y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1115 13 0 1 0
## B 0 731 4 3 1
## C 1 13 675 15 1
## D 0 2 5 624 2
## E 0 0 0 0 717
##
## Overall Statistics
##
## Accuracy : 0.9845
## 95% CI : (0.9801, 0.9881)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9803
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9991 0.9631 0.9868 0.9705 0.9945
## Specificity 0.9950 0.9975 0.9907 0.9973 1.0000
## Pos Pred Value 0.9876 0.9892 0.9574 0.9858 1.0000
## Neg Pred Value 0.9996 0.9912 0.9972 0.9942 0.9988
## Prevalence 0.2845 0.1935 0.1744 0.1639 0.1838
## Detection Rate 0.2842 0.1863 0.1721 0.1591 0.1828
## Detection Prevalence 0.2878 0.1884 0.1797 0.1614 0.1828
## Balanced Accuracy 0.9971 0.9803 0.9888 0.9839 0.9972
# OOB error versus number of trees for the fitted forest.
plot(WF_n$finalModel,main="Error VS no of tree")
# Validation-set performance of the gradient-boosted trees (gbm): ~92.2%
# accuracy — better than rpart, worse than the random forest.
confusionMatrix(predict(ERF_n,W_x),W_y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1094 47 1 6 3
## B 12 619 37 8 3
## C 6 59 632 47 12
## D 4 30 14 578 8
## E 0 4 0 4 695
##
## Overall Statistics
##
## Accuracy : 0.9223
## 95% CI : (0.9134, 0.9304)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9016
##
## Mcnemar's Test P-Value : 2.082e-12
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9803 0.8155 0.9240 0.8989 0.9639
## Specificity 0.9797 0.9810 0.9617 0.9829 0.9975
## Pos Pred Value 0.9505 0.9116 0.8360 0.9117 0.9886
## Neg Pred Value 0.9921 0.9568 0.9836 0.9802 0.9919
## Prevalence 0.2845 0.1935 0.1744 0.1639 0.1838
## Detection Rate 0.2789 0.1578 0.1611 0.1473 0.1772
## Detection Prevalence 0.2934 0.1731 0.1927 0.1616 0.1792
## Balanced Accuracy 0.9800 0.8983 0.9428 0.9409 0.9807
# Validation-set performance of the polynomial-kernel SVM (cost = 10):
# ~93.7% accuracy — second-best after the random forest.
confusionMatrix(predict(RGN_n,W_x),W_y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1106 49 15 22 3
## B 1 664 14 3 3
## C 4 41 643 50 5
## D 5 2 7 565 11
## E 0 3 5 3 699
##
## Overall Statistics
##
## Accuracy : 0.9373
## 95% CI : (0.9292, 0.9447)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9205
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9910 0.8748 0.9401 0.8787 0.9695
## Specificity 0.9683 0.9934 0.9691 0.9924 0.9966
## Pos Pred Value 0.9255 0.9693 0.8654 0.9576 0.9845
## Neg Pred Value 0.9963 0.9707 0.9871 0.9766 0.9932
## Prevalence 0.2845 0.1935 0.1744 0.1639 0.1838
## Detection Rate 0.2819 0.1693 0.1639 0.1440 0.1782
## Detection Prevalence 0.3046 0.1746 0.1894 0.1504 0.1810
## Balanced Accuracy 0.9797 0.9341 0.9546 0.9355 0.9830
# Score the 20 quiz cases. The quiz data must go through the SAME pipeline
# as the training data: select the 17 predictors, coerce them to numeric,
# and apply the fitted center/scale/BoxCox transform. The original code fed
# RAW features to models trained on transformed features (predict(abb, ...)
# was never applied), which invalidates the recorded predictions below.
Ythu2 <- Ythu %>% select(all_of(jp), problem_id)
xYthu <- Ythu2 %>% select(all_of(jp))
xYthu[, 1:17] <- lapply(xYthu[, 1:17], as.numeric)
xYthu <- predict(abb, xYthu)   # center/scale/BoxCox — was missing
result <- data.frame("problem_id" = Ythu$problem_id,
                     "PREDICTION_RF" = predict(WF_n, xYthu),
                     "PREDICTION_GBM" = predict(ERF_n, xYthu),
                     "PREDICTION_SVM" = predict(RGN_n, xYthu))
result
## NOTE(review): the table below is the output of the ORIGINAL run, i.e.
## WITHOUT the preprocessing step above — rerun to refresh these values.
## problem_id PREDICTION_RF PREDICTION_GBM PREDICTION_SVM
## 1 1 E E C
## 2 2 A E A
## 3 3 A E B
## 4 4 E E B
## 5 5 E E A
## 6 6 E D A
## 7 7 E E B
## 8 8 B D B
## 9 9 A E E
## 10 10 E E E
## 11 11 A E B
## 12 12 A B A
## 13 13 B B E
## 14 14 A A B
## 15 15 E E B
## 16 16 E E A
## 17 17 E E A
## 18 18 B E A
## 19 19 E E A
## 20 20 E E D